/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.fetcher; import net.nutch.net.protocols.Response; import net.nutch.net.protocols.http.BadHeaderLineException; import net.nutch.net.protocols.http.BadStatusLineException; import net.nutch.net.protocols.http.ChunkEOFException; import net.nutch.net.protocols.http.ChunkLengthParseException; import net.nutch.net.protocols.http.ContentLengthParseException; import net.nutch.net.protocols.http.DecompressionException; import net.nutch.net.protocols.http.Http; import net.nutch.net.protocols.http.HttpVersionException; import net.nutch.net.protocols.http.MiscHttpAccounting; import net.nutch.net.protocols.http.HttpResponse; import net.nutch.net.protocols.ftp.Ftp; import net.nutch.net.protocols.ftp.FtpResponse; import net.nutch.io.*; import net.nutch.util.*; import java.io.*; import java.net.*; import java.util.*; import java.util.logging.Logger; import java.util.logging.Level; import java.util.logging.Handler; /** * This class is a worker thread which polls the RequestScheduler for * requests and actually performs the fetch. */ public class FetcherThread extends Thread implements FetcherConstants { public static final Logger LOG = LogFormatter.getLogger("net.nutch.fetcher.FetcherThread"); public static final int DELAY_MS= 2 * 1000; private RequestScheduler scheduler; private boolean throttle; // 20040404, xing, do the same for http? private Ftp ftp; /** * Creates a new <code>FetcherThread</code> which will service * requests from the supplied <code>scheduler</code>. */ FetcherThread(RequestScheduler scheduler) { this.scheduler= scheduler; this.throttle= false; } /** * Polls the {@link RequestScheduler} for requests to service, * until {@link RequestScheduler#finishedRequests()} returns * <code>true</code>. */ public void run() { MiscHttpAccounting httpAccounting= new MiscHttpAccounting(); RequestRecord prevRequest= null; // 20040404, xing, do the same for http? //LOG.info("creating ftp"); ftp = new Ftp(); while (!scheduler.finishedRequests()) { RequestRecord request= null; // could also do Thread.stop() trick w/volatile synchronized (this) { if (throttle) { try { if (prevRequest != null) { scheduler.returnRequest(prevRequest, httpAccounting); prevRequest = null; } } catch (Exception e) { e.printStackTrace(); LOG.severe("Exception caught during call to" + " RequestScheduler.returnRequest()!"); } while (throttle) try { this.wait(); } catch (InterruptedException e) { } } } try { request= scheduler.returnRequestAndGetNext(prevRequest, httpAccounting); prevRequest= null; } catch (Exception e) { e.printStackTrace(); LOG.severe("Exception caught during call to" + " RequestScheduler.returnRequestAndGetNext()!"); } if (request == null) { try { Thread.sleep(DELAY_MS); } catch (InterruptedException e) { ; } continue; } URL url= request.getURL(); if (LOG.isLoggable(Level.FINE)) LOG.fine("Trying to fetch: " + url.toString()); InetAddress addr= request.getAddr(); Response response= null; httpAccounting.reset(); int httpVersion= request.getHttpVersion(); try { if ("http".equals(url.getProtocol())) { Http http = new Http(); http.setAgentString(scheduler.getAgentString()); // get HTTP response, don't follow redirects response = http.getRawResponse(url, addr, httpAccounting, httpVersion); } else if ("ftp".equals(url.getProtocol())) { //Ftp ftp = new Ftp(); response = ftp.getRawResponse(url, addr, httpAccounting, Http.HTTP_VER_NOTSET); } else { throw new IOException("Not an HTTP or FTP url:" + url); } } catch (ConnectException e) { request.setFailureReason(FAIL_CONNECTION_REFUSED); request.setHasFailed(true); } catch (SocketTimeoutException e) { request.setErrorReason(ERR_SOCKET_TIMEOUT); } catch (UnknownHostException e) { request.setFailureReason(FAIL_UNKNOWN_HOST); request.setHasFailed(true); } catch (NoRouteToHostException e) { request.setErrorReason(ERR_NO_ROUTE); } catch (SocketException e) { String msg= e.getMessage(); if (msg != null) { if (msg.indexOf("reset by peer") >= 0) { request.setErrorReason(ERR_RESET_BY_PEER); } else if (msg.indexOf("Network is unreachable") >= 0) { request.setErrorReason(ERR_NETWORK_UNREACHABLE); } else { request.setErrorReason(ERR_UNKNOWN); request.setErrorMessages(new String[] { e.toString()} ); } } else { request.setErrorReason(ERR_UNKNOWN); request.setErrorMessages(new String[] { e.toString()} ); } } catch (EOFException e) { request.setErrorReason(ERR_EOF_DURING_READ); } catch (IOException e) { String msg= e.getMessage(); if (msg != null) { if (msg.indexOf("Connection timed out") >= 0) { request.setErrorReason(ERR_CONNECTION_TIMED_OUT); } else if (msg.indexOf("Connection refused") >= 0) { request.setFailureReason(FAIL_CONNECTION_REFUSED); request.setHasFailed(true); } else { request.setErrorReason(ERR_UNKNOWN); request.setErrorMessages(new String[] { e.toString() } ); } } else { request.setErrorReason(ERR_UNKNOWN); request.setErrorMessages(new String[] { e.toString() } ); } } catch (BadStatusLineException e) { request.setErrorReason(ERR_BAD_STATUS_LINE); } catch (BadHeaderLineException e) { request.setErrorReason(ERR_BAD_HEADER_LINE); } catch (ContentLengthParseException e) { request.setErrorReason(ERR_BAD_CONTENT_LENGTH); } catch (ChunkLengthParseException e) { // HttpVersionException request.setErrorReason(ERR_CHUNKLEN_PARSE); // very conservative! always fall back to 1.0! httpVersion= Http.HTTP_VER_1_0; } catch (ChunkEOFException e) { // HttpVersionException request.setErrorReason(ERR_CHUNK_EOF); // very conservative! always fall back to 1.0! httpVersion= Http.HTTP_VER_1_0; } catch (DecompressionException e) { // HttpVersionException request.setErrorReason(ERR_DECOMPRESS); // very conservative! always fall back to 1.0! httpVersion= Http.HTTP_VER_1_0; } catch (HttpVersionException e) { request.setErrorReason(ERR_UNKNOWN); request.setErrorMessages(new String[] { e.toString()} ); // try to fall back by 1 http version httpVersion= Http.minHttpVersion(httpVersion - 1, Http.HTTP_VER_1_0); } catch (Exception e) { request.setErrorReason(ERR_UNKNOWN); request.setErrorMessages(new String[] { e.toString()} ); } request.setResponse(response); if (httpAccounting.getAddr() != null) { // set addr for HostQueue to cache request.setAddr(httpAccounting.getAddr()); } int servVers= httpAccounting.getServHttpVersion(); request.setHttpVersion( Http.minHttpVersion(httpVersion, servVers) ); if (LOG.isLoggable(Level.FINE)) LOG.fine("done request: " + url.toString()); prevRequest= request; } // return last request scheduler.returnRequestAndGetNext(prevRequest, httpAccounting); // 20040404, xing, do the same for http? //LOG.info("deleting ftp"); ftp = null; // force garbage collection? System.gc(); } /** * Causes this FetcherThread to stop working, after it finishes * any work it has outstanding and returns it. * * <p> * * <em>Note:</em>The thread must be <code>unthrottle()</code>'d * before it's <code>run()</code> method will complete. * */ public void throttle() { synchronized (this) { throttle= true; } } /** * Causes this FetcherThread to resume work after a call to * <code>throttle()</code>. */ public void unthrottle() { synchronized (this) { throttle= false; this.notify(); } } }